# Embed the scanned written work for Question 1.
knitr::include_graphics("hw2_q1.png")
We use \(\chi^2_{2,0.5}\) to indicate that the density contour will have 50% of the probability, and this value will equal \(c^2\). We know our major axis is \(c\sqrt{\lambda_1}e_1\) and our minor axis is \(c\sqrt{\lambda_2}e_2\) from the center, where \(\lambda_1\) and \(\lambda_2\) are eigenvalues of the covariance matrix and \(\lambda_1 > \lambda_2\).
# c^2 is the chi-square(2) quantile at probability 0.5, so the ellipse
# {x : (x - mu)' S^{-1} (x - mu) <= c^2} encloses 50% of the probability.
c_squared <- qchisq(0.5,2)
# NOTE(review): this shadows base::c; calls such as c(2, ...) below still
# resolve to the function because R searches for a function in call position,
# but a different variable name would be safer.
c <- sqrt(c_squared)
# Covariance matrix of the bivariate normal (symmetric, 2 x 2).
covariance <- rbind(c(2, -3*sqrt(3)/5), c(-3*sqrt(3)/5, 1.5))
# Eigenvectors give the axis directions of the contour ellipse; eigenvalues
# give the squared relative half-axis lengths.
eigen(covariance)
## eigen() decomposition
## $values
## [1] 2.8188779 0.6811221
##
## $vectors
## [,1] [,2]
## [1,] -0.7854585 -0.6189143
## [2,] 0.6189143 -0.7854585
# Half-axis vectors c * sqrt(lambda_i) * e_i, measured from the center.
major_axis <- c*sqrt(eigen(covariance)$values[1])*eigen(covariance)$vectors[,1]
minor_axis <- c*sqrt(eigen(covariance)$values[2])*eigen(covariance)$vectors[,2]
# these are vectors from the center
major_axis
## [1] -1.552706 1.223479
minor_axis
## [1] -0.6014101 -0.7632441
Now, we plot the constant-density contour that contains 50% of the probability.
library(MASS)
library(ellipse)
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
##
## pairs
set.seed(605574052)
# NOTE(review): `mean` shadows base::mean and, below, `ellipse` shadows
# ellipse::ellipse after it is called; both work here because R resolves
# functions separately in call position, but distinct names would be clearer.
mean <- c(3,1)
# 50% contour of N(mean, covariance): ellipse() returns points centered at
# the origin, so shift every point by the mean via the double transpose.
ellipse <- t(t(ellipse(covariance, level=0.5, npoints=1000)) + mean)
# create a random sample of 3000 observations following our MVN distribution
X <- mvrnorm(n=3000, mu=mean, Sigma=covariance)
plot(X, xlab = "X", ylab = "Y", asp=0.9)
# Overlay the 50% contour, the center, and the two half-axis segments.
lines(ellipse, col="blue")
points(mean[1], mean[2], col="blue")
segments(mean[1], mean[2], mean[1]+major_axis[1], mean[2]+major_axis[2], col="blue")
segments(mean[1], mean[2], mean[1]+minor_axis[1], mean[2]+minor_axis[2], col="blue")
# Embed the scanned written work for Questions 2 through 4.
knitr::include_graphics("hw2_q2_1.png")
knitr::include_graphics("hw2_q2_2.png")
knitr::include_graphics("hw2_q3_1.png")
knitr::include_graphics("hw2_q3_2.png")
knitr::include_graphics("hw2_q4_1.png")
knitr::include_graphics("hw2_q4_2.png")
# Bivariate sample of n = 11 observations (one per index position).
x_1 <- c(1, 1, 3, 3, 4, 5, 5, 7, 9, 10, 13)
x_2 <- c(17.95, 19, 17.85, 16.5, 14.1, 12.15, 11, 9.35, 6, 4.15, 2.99)
X <- cbind(x_1, x_2)
# getting our sample mean and covariance matrix
mu <- c(mean(x_1), mean(x_2))
sigma <- cov(X)
Now, we calculate the squared statistical distances using the formula \((x_i-\bar{x})^TS^{-1}(x_i-\bar{x})\).
# Squared statistical (Mahalanobis) distances (x_i - xbar)' S^{-1} (x_i - xbar)
# for every row of X, returned in the original row order.
#
# X     : numeric matrix (n x p), one observation per row
# mu    : numeric vector of length p (the center / sample mean)
# sigma : p x p covariance matrix S
#
# Returns a data.frame with columns
#   x : row index 1..n
#   y : squared distance of that row from mu
dist_func <- function(X, mu, sigma) {
  # stats::mahalanobis computes all n distances in one vectorized call.
  # This replaces the previous rbind-in-a-loop version, which grew a
  # data.frame quadratically and silently depended on a global `dist`
  # accumulator existing before the first call.
  d2 <- unname(mahalanobis(X, center = mu, cov = sigma))
  data.frame(x = seq_len(nrow(X)), y = d2)
}
# Squared statistical distance of each of the 11 observations from the mean.
output <- dist_func(X, mu, sigma)
output
## x y
## 1 1 1.6457872
## 2 2 1.5278722
## 3 3 3.5264271
## 4 4 0.9103886
## 5 5 0.1662493
## 6 6 0.2185317
## 7 7 1.8649088
## 8 8 0.2630483
## 9 9 1.2769104
## 10 10 2.3281935
## 11 11 6.2716829
# TRUE where the squared distance is below the chi-square(2) median
# (about 1.386), i.e. the point lies inside the estimated 50% contour.
output[,2] < qchisq(0.5,2)
## [1] FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE
We observe that 5 of the 11 squared distances are less than the chi-squared quantile above, meaning these 5 points fall within the estimated 50% probability contour of a bivariate normal distribution.
# Chi-square Q-Q plot: sorted squared distances against the chi-square(2)
# quantiles at probabilities (i - 0.5)/n for the n = 11 observations.
ordered_dist <- output[order(output$y), ]
probs <- (seq_len(11) - 0.5) / 11
plot(qchisq(probs, 2), ordered_dist$y,
     xlab = "Chi-Square Quantiles", ylab = "Squared Distances")
According to the above parts, we have that the data is approximately bivariate normal because about half of our observations (\(\frac{5}{11}\)) lie within the 50% probability contour, indicating good bivariate normality of our observations. Moreover, observing the chi-square plot, the points seem to loosely follow the straight line you would expect to see in a chi-square plot, supporting the normality claim.
# Chemical composition of 88 ceramic samples (oxide concentrations).
ceramic <- read.csv("Chemical Composion of Ceramic.csv")
head(ceramic)
## Ceramic.Name Part Na2O MgO Al2O3 SiO2 K2O CaO TiO2 Fe2O3 MnO CuO ZnO PbO2
## 1 FLQ-1-b Body 0.62 0.38 19.61 71.99 4.84 0.31 0.07 1.18 630 10 70 10
## 2 FLQ-2-b Body 0.57 0.47 21.19 70.09 4.98 0.49 0.09 1.12 380 20 80 40
## 3 FLQ-3-b Body 0.49 0.19 18.60 74.70 3.47 0.43 0.06 1.07 420 20 50 50
## 4 FLQ-4-b Body 0.89 0.30 18.01 74.19 4.01 0.27 0.09 1.23 460 20 70 60
## 5 FLQ-5-b Body 0.03 0.36 18.41 73.99 4.33 0.65 0.05 1.19 380 40 90 40
## 6 FLQ-6-b Body 0.62 0.18 18.82 73.79 4.28 0.30 0.04 0.96 350 20 80 10
## Rb2O SrO Y2O3 ZrO2 P2O5
## 1 430 0 40 80 90
## 2 430 -10 40 100 110
## 3 380 40 40 80 200
## 4 380 10 40 70 210
## 5 360 10 30 80 150
## 6 390 10 40 80 130
# we observe the marginal normality first:
# Normal Q-Q plot for each of the four variables of interest; an
# approximately straight line suggests marginal (univariate) normality.
qqnorm(ceramic$MnO)
qqnorm(ceramic$CuO)
qqnorm(ceramic$ZrO2)
qqnorm(ceramic$P2O5)
Observing the four qq-plots for the four variables we are interested in, we notice that the first and the last plots hint at a non-normality of the univariate data as the points in those plots do not really form a straight line. However, in the second and the third plots, there is a better linear association.
# now, we observe the multivariate normality:
X <- cbind(ceramic$MnO, ceramic$CuO, ceramic$ZrO2, ceramic$P2O5)
mu <- c(mean(ceramic$MnO), mean(ceramic$CuO), mean(ceramic$ZrO2), mean(ceramic$P2O5))
sigma <- cov(X)
# Squared statistical distances, then sorted for the chi-square plot.
output <- dist_func(X, mu, sigma)
ordered_dist <- output[order(output$y),]
# NOTE(review): 88 is the hard-coded sample size — assumes nrow(ceramic) == 88;
# nrow(output) would be safer if the data ever change.
plot(qchisq((1:88-0.5)/88, 4), ordered_dist[,2], xlab="Chi-Square Quantiles", ylab="Squared Distances")
Looking at the above chi-square plot, there doesn’t seem to be good multivariate normality either, because the points do not really follow the straight line that we would expect in a chi-square plot. But perhaps this plot would look better if we removed some of the outliers? This will be explored in part b).
First, we will try to remove some of the outliers (points with a squared distance \(>15\)), then plot the chi-square plot again.
# the outliers are:
# Positions 86:88 are the three largest sorted distances (all > 15).
ordered_dist[86:88,]
## x y
## 83 83 16.38079
## 43 43 17.05710
## 52 52 19.66541
# now we remove these points:
# NOTE(review): 85 = 88 - 3 is hard-coded; keeping the first n - 3 rows of
# the sorted distances drops exactly the three flagged outliers.
ordered_dist <- ordered_dist[1:85,]
plot(qchisq((1:85-0.5)/85, 4), ordered_dist[,2], xlab="Chi-Square Quantiles", ylab="Squared Distances")
This certainly is a better looking plot to analyze. Here, we notice a plot that resembles the desired shape so this shows good multivariate normality. Now, we will perform Box-Cox transformation for the univariate data:
# Box-Cox profile likelihood over lambda for each variable, using an
# intercept-only model (i.e. transforming the variable itself).
# NOTE(review): boxcox requires a strictly positive response — presumably
# MnO and P2O5 are all positive here; verify against the data.
pwr1 <- MASS::boxcox(lm(ceramic$MnO~1))
pwr4 <- MASS::boxcox(lm(ceramic$P2O5~1))
# the following will return the values that maximizes the log-likelihood
lambda_1 <- pwr1$x[which.max(pwr1$y)]
lambda_4 <- pwr4$x[which.max(pwr4$y)]
Now, we create a new set of qqplots based on the derived numbers above.
# Normal Q-Q plots of the Box-Cox transformed variables,
# using the standard form (v^lambda - 1) / lambda.
box_cox <- function(v, lambda) (v^lambda - 1) / lambda
qqnorm(box_cox(ceramic$MnO, lambda_1))
qqnorm(box_cox(ceramic$P2O5, lambda_4))
In the modified qq-plots above, there seems to be a better linear trend, indicating a better normality of the univariate data.
library(ICSNP)
## Loading required package: mvtnorm
## Loading required package: ICS
# LASERI: hemodynamic measurements on 223 subjects (ICSNP package).
data(LASERI)
head(LASERI)
## Sex Age Height Weight Waist Hip PWVT3 PWVT4 BMI WHR HRT1 HRT2 HRT3
## 1 Male 33 181 88 86.9 102.0 17.1 10.9 26.9 0.852 75 80 91
## 2 Female 30 152 58 76.5 91.5 13.2 7.4 25.1 0.836 65 65 82
## 3 Female 33 174 104 85.4 114.5 13.3 8.1 34.4 0.746 67 73 79
## 4 Female 36 168 81 90.6 107.7 16.0 16.0 28.7 0.841 61 75 74
## 5 Female 36 172 90 88.1 107.8 15.8 9.0 30.4 0.817 68 76 80
## 6 Female 33 165 101 124.7 118.1 10.8 8.8 37.1 1.056 57 73 67
## HRT4 COT1 COT2 COT3 COT4 SVRIT1 SVRIT2 SVRIT3 SVRIT4 PWVT1 PWVT2 HRT1T4
## 1 64 7.02 5.92 6.21 5.67 2228 2983 2196 2698 10.6 19.7 11
## 2 65 5.60 3.74 4.07 5.27 1827 3085 2981 1957 7.2 11.6 0
## 3 66 7.55 5.32 5.75 6.19 1946 2828 2676 2353 7.0 11.1 1
## 4 74 6.03 4.39 4.75 4.75 2103 3024 3118 2690 9.1 16.7 -13
## 5 64 7.98 5.05 5.05 6.91 1757 2928 2882 2062 9.0 15.6 4
## 6 49 4.23 4.17 3.82 3.51 3251 3718 4035 3886 8.3 11.4 8
## COT1T4 SVRIT1T4 PWVT1T4 HRT1T2 COT1T2 SVRIT1T2 PWVT1T2
## 1 1.35 -470 -0.3 -5 1.10 -755 -9.1
## 2 0.33 -130 -0.2 0 1.86 -1258 -4.4
## 3 1.36 -407 -1.1 -6 2.23 -882 -4.1
## 4 1.28 -587 -6.9 -14 1.64 -921 -7.6
## 5 1.07 -305 0.0 -8 2.93 -1171 -6.6
## 6 0.72 -635 -0.5 -16 0.06 -467 -3.1
# we observe the marginal normality first:
# Normal Q-Q plots of heart rate at the four measurement times.
qqnorm(LASERI$HRT1)
qqnorm(LASERI$HRT2)
qqnorm(LASERI$HRT3)
qqnorm(LASERI$HRT4)
From the above four qq-plots, we notice that all the qq-plots seem to hint at good normality as most points follow a straight linear trend. So, we move on to observe the multivariate normality by constructing a chi-square plot.
# Heart-rate measurements at the four time points, one row per subject.
X <- cbind(LASERI$HRT1, LASERI$HRT2, LASERI$HRT3, LASERI$HRT4)
# Sample mean vector and covariance matrix of the four variables.
mu <- colMeans(X)
sigma <- cov(X)
# Squared statistical distances, sorted for the chi-square plot.
output <- dist_func(X, mu, sigma)
ordered_dist <- output[order(output$y), ]
# Use nrow(X) instead of the previously hard-coded sample size (223) so the
# quantile grid stays correct if the data change; values are identical here.
n <- nrow(X)
plot(qchisq((seq_len(n) - 0.5) / n, 4), ordered_dist[,2],
     xlab = "Chi-Square Quantiles", ylab = "Squared Distances")
There clearly seems to be 2 outliers that are messing with the shape of the above plot. Thus, we will locate and remove these outliers and plot the chi-square plot again.
# the outliers are:
# Positions 222:223 are the two largest sorted distances.
ordered_dist[c(222,223),]
## x y
## 66 66 33.31074
## 219 219 53.67008
# now we remove these points:
# NOTE(review): 221 = 223 - 2 is hard-coded; this keeps all but the two
# flagged outliers from the sorted distances.
ordered_dist <- ordered_dist[1:221,]
plot(qchisq((1:221-0.5)/221, 4), ordered_dist[,2], xlab="Chi-Square Quantiles", ylab="Squared Distances")
Our chi-square plot here looks a lot better in terms of multivariate normality.
We will perform the Hotelling Test now that our data is approximately multivariate normal.
# One-sample Hotelling's T^2 test of H0: mu = (0,0,0,0)'.
# NOTE(review): counting the columns in the head() output above, positions
# 12:15 appear to be HRT2, HRT3, HRT4, COT1 — if the intent is to test
# HRT1-HRT4 (as the surrounding text says), this should likely be
# LASERI[11:14]; verify the column order of the LASERI data.
HotellingsT2(LASERI[12:15])
##
## Hotelling's one sample T2-test
##
## data: LASERI[12:15]
## T.2 = 3350.5, df1 = 4, df2 = 219, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to c(0,0,0,0)
Here, we have tested the null \(H_0: \mu = (0,0,0,0)^T\) against the alternative hypothesis \(H_a: \mu \neq (0,0,0,0)^T\). As our p-value is clearly less than 0.05, we reject the null hypothesis and conclude that the true mean vector differs from \((0,0,0,0)^T\).